###############################
# MSc thesis Rianne Kraakman
# January-May 2021
# Part 3: Convergent validity analyses
# Data preparation
###############################


##### Open data

library(haven)
# Survey (self-report) file; per its file name it holds the full set of
# respondents with the recoded self-report variables.
DBEWEEG_selfreport <- haven::read_sav(
  file = "F:/Thesis/ResearchArchive_MScThesisRianneKraakman/3_ConvergentValidityAnalyses/3.1_data_preparation/Data/DBEWEEG_eindbestand volledige set van alle respondenten_met self-report_recoded_analyse.sav"
)

# ActivPAL (AP) and personal tracker data of the respondents who received the
# I&O questions as self-report instrument.
DBEWEEG_A_APdata <- haven::read_sav(
  file = "F:/Thesis/ResearchArchive_MScThesisRianneKraakman/3_ConvergentValidityAnalyses/3.1_data_preparation/Data/20201808 DBEWEEG plus APdata Avragentotaal.sav"
)

# ActivPAL (AP) and personal tracker data of the respondents who received the
# SQUASH as self-report instrument.
DBEWEEG_B_APdata <- haven::read_sav(
  file = "F:/Thesis/ResearchArchive_MScThesisRianneKraakman/3_ConvergentValidityAnalyses/3.1_data_preparation/Data/20201808 DBEWEEG plus APdata Bvragentotaal.sav"
)


##### Delete unnecessary variables #####

# List the variables that are available in the survey (self-report) data.
colnames(DBEWEEG_selfreport)

# Keep only the variables indicating physical activity and the person
# characteristics.
DBEWEEG_selfreport2 <- subset(
  DBEWEEG_selfreport,
  select = c(
    Random1, Random2, GESLACHT, IOOPLEIDING, LEEFTIJD, PROVC, LFT4,
    Batch1_Inv, Batch2_Inv, Batch1_Compl, Batch2_Compl, Resp,
    MZ_MWK, totmod, Ind_MinWk_150, Ind_MinWk_150_2dgn, Ind_MinWk, totBTSP,
    Ind_BotSpier, totbalans, Ind_Balans,
    KI_RLBEW2017_150, KI_RLBEW2017, KI_RLBEW2017_150_2dgn,
    sb_mwk, l_mwk, l_mwk_compleet,
    totBTSP_zw, totspier, Ind_BotSpier_zw, KI_RLBEW2017_zw,
    KIsporter, totmod_compleet, Ind_MinWk_compleet, Ind_MinWk_150_compleet,
    Ind_MinWk_150_2dgn_compleet, Ind_BotSpier_compleet,
    Ind_BotSpier_zw_compleet, Ind_Balans_compleet,
    KI_RLBEW2017_compleet, KI_RLBEW2017_150_compleet,
    KI_RLBEW2017_150_2dgn_compleet, KI_RLBEW2017_zw_compleet,
    KI_RLBEW2017_150_zw_compleet, KI_RLBEW2017_150_2dgn_zw_compleet,
    KIsporter_compleet, totmwk_compleet,
    education, V2e, V2e_Open, possession, usage, B5_1_1
  )
)

# Keep only the respondents that participated in the follow-up, i.e. those
# with Batch1_Compl or Batch2_Compl equal to 1.
# The condition is NA when one flag is missing and the other is not 1. Indexing
# rows with an NA subscript inserts an all-NA row, so which() is used to keep
# only the rows for which the condition is TRUE.
DBEWEEG_selfreport3 <- DBEWEEG_selfreport2[
  which(
    DBEWEEG_selfreport2$Batch1_Compl == 1 |
      DBEWEEG_selfreport2$Batch2_Compl == 1
  ),
]


# List the variables that are available in the AP data of the I&O group.
names(DBEWEEG_A_APdata)

# Keep only the PA measurement variables and the person characteristics.
# Identifiers and characteristics are selected by name; the measurement
# variables are selected by column position.
# NOTE(review): the positional ranges rely on the column order of this
# specific input file - confirm them whenever the source file changes.
DBEWEEG_A_APdata2 <- subset(
  DBEWEEG_A_APdata,
  select = c(
    Random1, Random2, PALnr, Respnr, B5_1_1, DBEWEEG_id,
    GESLACHT, IOOPLEIDING, PROVC, LFT4,
    SerialNumber, conditie,
    141:196, 210:523,
    VI2, VI3
  )
)

# List the variables that are available in the AP data of the SQUASH group.
names(DBEWEEG_B_APdata)

# Keep only the PA measurement variables and the person characteristics.
# Identifiers and characteristics are selected by name; the measurement
# variables are selected by column position.
# NOTE(review): the positional ranges rely on the column order of this
# specific input file - confirm them whenever the source file changes.
DBEWEEG_B_APdata2 <- subset(
  DBEWEEG_B_APdata,
  select = c(
    Random1, Random2, PALnr, Respnr, B5_1_1, DBEWEEG_id,
    GESLACHT, IOOPLEIDING, PROVC, LFT4,
    SerialNumber, conditie,
    133:188, 202:515,
    VI2, VI3
  )
)

  

##### Combine datasets

#First combine the AP data for the I&O and SQUASH questions
#To join the two data frames vertically, we can use the rbind function. 
#For this, the datasets must have the same variables.

# Verify that the two AP subsets share all of their variables.
#install.packages("arsenal")
library(arsenal)
summary(
  comparedf(
    x = DBEWEEG_A_APdata2,
    y = DBEWEEG_B_APdata2
  )
)
# Observation from the comparison above: all variables in the two datasets
# are the same, so the datasets can be combined.

# Stack the I&O rows on top of the SQUASH rows.
DBEWEEG_APdata <- rbind(
  DBEWEEG_A_APdata2,
  DBEWEEG_B_APdata2
)

# Next, link the self-report data to the data from the follow-up study.
# Inspect the variables that the two datasets have in common.
table(DBEWEEG_APdata$GESLACHT)
table(DBEWEEG_selfreport3$GESLACHT)
table(DBEWEEG_APdata$IOOPLEIDING)
table(DBEWEEG_selfreport3$IOOPLEIDING)
table(DBEWEEG_APdata$LFT4)
table(DBEWEEG_selfreport3$LFT4)
table(DBEWEEG_APdata$PROVC)
table(DBEWEEG_selfreport3$PROVC)
table(DBEWEEG_APdata$Random1)
table(DBEWEEG_selfreport3$Random1)
table(DBEWEEG_APdata$B5_1_1)
table(DBEWEEG_selfreport3$B5_1_1)

# Build a linking key from the six shared variables; it is meant to be unique
# for every respondent.
# The values are joined with "_" because pasting them without a separator is
# ambiguous as soon as one variable has multi-digit values: 1 followed by 12
# and 11 followed by 2 would both become "112" and link different respondents.
DBEWEEG_APdata$key <- with(
  DBEWEEG_APdata,
  paste(GESLACHT, IOOPLEIDING, LFT4, PROVC, Random1, B5_1_1, sep = "_")
)
table(DBEWEEG_APdata$key)

DBEWEEG_selfreport3$key <- with(
  DBEWEEG_selfreport3,
  paste(GESLACHT, IOOPLEIDING, LFT4, PROVC, Random1, B5_1_1, sep = "_")
)
table(DBEWEEG_selfreport3$key)

# merge() pairs every row of one dataset with every row of the other that has
# the same key, so a duplicated key multiplies rows. Warn instead of relying
# only on a visual check of the tables above.
if (anyDuplicated(DBEWEEG_APdata$key) > 0 ||
    anyDuplicated(DBEWEEG_selfreport3$key) > 0) {
  warning(
    "The linking key is not unique in at least one dataset; ",
    "merge() may duplicate respondents.",
    call. = FALSE
  )
}

# merge() keeps matching keys only (all = FALSE by default), so AP respondents
# without a self-report match would disappear without notice.
if (!all(DBEWEEG_APdata$key %in% DBEWEEG_selfreport3$key)) {
  warning(
    "Some AP respondents have no matching self-report record; ",
    "they are dropped by merge().",
    call. = FALSE
  )
}

# Compare the datasets
summary(comparedf(DBEWEEG_APdata, DBEWEEG_selfreport3))

# Join the datasets on the key variable. Variables that occur in both inputs
# get the suffix .x (AP data) or .y (self-report data).
DBEWEEG_complete <- merge(DBEWEEG_APdata, DBEWEEG_selfreport3, by = "key")

# Check the result: the .x and .y copies of the shared variables should agree,
# and the respondent numbers of both sources are tabulated for inspection.
subset(
  DBEWEEG_complete,
  select = c(
    GESLACHT.x, GESLACHT.y, LFT4.x, LFT4.y, IOOPLEIDING.x, IOOPLEIDING.y,
    PROVC.x, PROVC.y, B5_1_1.x, B5_1_1.y
  )
)
table(DBEWEEG_complete$Respnr)
table(DBEWEEG_complete$Resp)



##### Delete more unnecessary variables and rename

# The merge left two copies (.x and .y) of every variable that was present in
# both inputs. Remove the .x copies, and remove both copies of B5_1_1.
DBEWEEG_complete <- subset(
  DBEWEEG_complete,
  select = -c(
    GESLACHT.x, LFT4.x, IOOPLEIDING.x, PROVC.x,
    Random1.x, Random2.x,
    B5_1_1.x, B5_1_1.y
  )
)

# Give the remaining .y copies their 'normal' name back. The pattern is
# anchored, so only these six exact column names are changed.
colnames(DBEWEEG_complete) <- sub(
  "^(GESLACHT|LFT4|IOOPLEIDING|PROVC|Random1|Random2)\\.y$",
  "\\1",
  colnames(DBEWEEG_complete)
)

# Only sleep, step count and active minutes of the personal activity trackers
# are needed. The other tracker variables follow the naming pattern
# V11_<metric>_<day>, so their names are generated from the two parts.
tracker_metrics <- c(
  "afgelegde_afstand", "calorieverbranding", "hartslag",
  "gemiddelde_snelheid", "aantal_gelopen_traptreden"
)
tracker_days <- c(
  "Ma_6_juli", "Di_7_juli", "Wo_8_juli", "Do_9_juli",
  "Vr_10_juli", "Za_11_juli", "Zo_12_juli"
)
unused_tracker_vars <- paste0(
  "V11_",
  rep(tracker_metrics, times = length(tracker_days)),
  "_",
  rep(tracker_days, each = length(tracker_metrics))
)

# Remove the 35 unused tracker variables by their column positions.
DBEWEEG_complete <- subset(
  DBEWEEG_complete,
  select = -match(unused_tracker_vars, colnames(DBEWEEG_complete))
)
colnames(DBEWEEG_complete)




##### valid data check ActivPAL

# Show the number of valid ActivPAL days per respondent, and cross-tabulate it
# against the condition.
DBEWEEG_APdata[["nvaliddays"]]
table(DBEWEEG_APdata[["nvaliddays"]], DBEWEEG_APdata[["conditie"]])
# Findings from the table above:
# - In condition 0 (batch 1, ActivPAL only), 17 respondents have valid data
#   (at least 5 valid days).
# - In condition 1 (batch 2, ActivPAL and trackers), 26 respondents have valid
#   ActivPAL data (at least 5 valid days).
# - 2 people in the dataset have no valid ActivPAL days at all; both were in
#   condition 0 (batch 1).
# - 1 person never wore the ActivPAL and is therefore not included in the
#   dataset. That is why the data has 47 rows, while 48 of the invitees
#   consented.


##### Save data

# Write the merged dataset to a new SPSS file in the Output folder.
haven::write_sav(
  data = DBEWEEG_complete,
  path = "F:/Thesis/ResearchArchive_MScThesisRianneKraakman/3_ConvergentValidityAnalyses/3.1_data_preparation/Output/DBEWEEG_withAPdata_complete.sav"
)
